# aggregate oa_status -----
oa_status <- papers %>%
filter(!is.na(is_oa)) %>%
select(paperid, SDG_label, year, is_oa, provider_cat)
oa_per_year <- oa_status %>%
count(SDG_label, year, is_oa) %>%
collect()
oa_status_per_year <- oa_status %>%
count(SDG_label, year, provider_cat) %>%
collect()
oa_per_year %>%
group_by(SDG_label, year) %>%
mutate(oa_share = n/sum(n)) %>%
filter(is_oa) %>%
ggplot(aes(as_year(year), oa_share, colour = fix_sdg(SDG_label),
group = fix_sdg(SDG_label))) +
geom_point() +
geom_line() +
scale_y_continuous(labels = function(x) scales::percent(x, accuracy = 1)) +
labs(x = NULL, title = "OA share by SDG",
y = NULL, colour = NULL) +
theme(legend.position = "top")
oa_status_per_year %>%
group_by(SDG_label, year) %>%
filter(!is.na(provider_cat), provider_cat != "Not OA") %>%
mutate(oa_share = n/sum(n)) %>%
ggplot(aes(as_year(year), oa_share,
colour = fix_sdg(SDG_label),
group = fix_sdg(SDG_label))) +
geom_point() +
geom_line() +
facet_wrap(vars(provider_cat)) +
scale_y_continuous(labels = function(x) scales::percent(x, accuracy = 1)) +
labs(x = NULL, title = "OA share by SDG",
y = NULL, colour = NULL) +
theme(legend.position = "top")
The rise in OA is thus mainly due to rise in gold OA, slightly rise in hybrid. However, unpaywall prefers gold over green: redo this figure with new categorisation.
funder_overview <- papers %>%
# restrict to unpaywall
filter(!is.na(is_oa), !is.na(is_funded)) %>%
group_by(is_funded, year, SDG_label) %>%
summarise(mean_oa = mean(as.numeric(is_oa))) %>%
collect()
funder_overview %>%
ggplot(aes(as_year(year), mean_oa, colour = is_funded)) +
geom_line() +
geom_point() +
facet_wrap(vars(SDG_label)) +
scale_y_continuous(labels = scales::percent) +
labs(x = NULL, y = "% of publications which are OA",
title = "OA by funding status") +
theme(legend.position = "top")
paper_oa_flag <- papers %>%
select(paperid, is_oa, provider_cat, year, SDG_label)
oa_per_funder <- funded_projects %>%
left_join(paper_oa_flag)
## Joining, by = "paperid"
oa_per_funder_aggregated <- oa_per_funder %>%
select(doi, funder_name, year, is_oa) %>%
distinct() %>% # remove duplicate rows since many papers are funded by multiple projects
group_by(year, funder_name) %>%
count(is_oa) %>%
filter(!is.na(is_oa)) %>%
mutate(oa_share = n/sum(n),
total_papers = sum(n)) %>%
collect()
text_labels <- oa_per_funder_aggregated %>%
group_by(funder_name) %>%
summarise(nn_papers = sum(total_papers)) %>%
mutate(label = glue::glue("n = {format(nn_papers, big.mark = '.', decimal.mark = ',')}"))
oa_per_funder_aggregated %>%
filter(is_oa) %>%
ggplot(aes(lubridate::ymd(year, truncated = 2L), oa_share,
group = funder_name)) +
geom_point() +
geom_line() +
geom_text(aes(x = lubridate::ymd("2017-01-01"),
y = .4,
label = label),
data = text_labels) +
facet_wrap(vars(str_wrap(funder_name, 50))) +
scale_y_continuous(labels = function(x) scales::percent(x, accuracy = 1)) +
labs(x = NULL, title = "OA share by funder",
y = NULL, colour = NULL) +
theme(legend.position = "top")
Share of OA going down for CA institutes of health. There is an explanation for this in the Nature 2018 paper by Lariviere and Sugimoto.
oa_per_affiliation <- oa_status %>%
left_join(author_paper_affiliations) %>%
right_join(affils)
## Joining, by = "paperid"
## Joining, by = "affiliationid"
oa_per_affiliation_selected <- oa_per_affiliation %>%
group_by(paperid) %>%
mutate(frac_count = 1 / max(authorsequencenumber, na.rm = TRUE)) %>%
select(paperid, authorid, is_oa, provider_cat, year, country, frac_count,
SDG_label)
oa_per_country <- oa_per_affiliation_selected %>%
group_by(country, is_oa) %>%
summarise(sum_frac_oa = sum(frac_count)) %>%
mutate(prop_oa = sum_frac_oa/sum(sum_frac_oa),
sum_frac_total = sum(sum_frac_oa)) %>%
collect()
# use the mean of the 2015-2018 for now to reduce missing data
# maybe better to to lag values (drag forward if missing)
wb_local <- wb_indicators %>%
filter(year >= 2015 & year <= 2018) %>%
group_by(country_name, country_code, indicator_code, indicator_name) %>%
summarise(value = mean(value, na.rm = TRUE)) %>%
collect()
proper_countries <- wb_countries %>%
filter(!is.na(`Currency Unit`)) %>%
select(country_code = `Country Code`, short_name = `Short Name`,
region = Region, income_group = `Income Group`)
wb_2015_2018 <- wb_local %>%
select(country_code, country_name, indicator_name, value) %>%
pivot_wider(names_from = "indicator_name", values_from = "value") %>%
right_join(proper_countries)
## Joining, by = "country_code"
cor_matrix <- wb_2015_2018 %>%
select(-starts_with("country"), -short_name, -region, -income_group) %>%
cor(use = "pairwise.complete.obs")
wb_2015_2018 %>%
filter(!is.na(country_name)) %>%
select(-short_name) %>%
vis_miss(cluster = TRUE)
Quite a large portion (about 40%) of ocuntries does not have data for R&D expenditure, even when averaging over the years 2015-2018.
plot_correlation(cor_matrix, cluster = TRUE)
## OA per gdp, % spent on research
oa_with_r_d_gdp <- oa_per_country %>%
left_join(wb_local, by = c("country" = "country_code")) %>%
select(-indicator_name) %>%
filter(indicator_code %in% c("GB.XPD.RSDV.GD.ZS", "NY.GDP.MKTP.KD")) %>%
pivot_wider(names_from = indicator_code, values_from = value) %>%
drop_na() %>%
filter(is_oa)
oa_with_r_d_gdp %>%
ggplot(aes(GB.XPD.RSDV.GD.ZS, prop_oa, colour = NY.GDP.MKTP.KD)) +
geom_point(aes(size = NY.GDP.MKTP.KD), show.legend = FALSE) +
scale_colour_viridis_c(trans = "log", option = "D", end = .9) +
ggrepel::geom_text_repel(aes(label = country_name), show.legend = FALSE) +
labs(x = "Expenditure towards research", y = "OA share") +
scale_y_continuous(labels = scales::percent)
oa_with_gdp_per_cap <- oa_per_country %>%
left_join(wb_local, by = c("country" = "country_code")) %>%
select(-indicator_name) %>%
filter(indicator_code %in% c("NY.GDP.PCAP.KD")) %>%
pivot_wider(names_from = indicator_code, values_from = value) %>%
drop_na() %>%
filter(is_oa)
p <- oa_with_gdp_per_cap %>%
filter(sum_frac_total >= 100) %>%
left_join(proper_countries, by = c("country" = "country_code")) %>%
ggplot(aes(NY.GDP.PCAP.KD, prop_oa)) +
geom_point(aes(size = sum_frac_total)) +
labs(x = "GDP per capita", y = "% of publications which are OA",
size = "# of publications") +
scale_size_continuous(trans = "sqrt", labels = scales::comma) +
scale_y_continuous(labels = scales::percent) +
scale_x_log10(breaks = c(1e+03, 2e+03, 5e+03, 1e+04, 2e+04, 5e+04, 1e+05),
labels = function(x) scales::comma(x, prefix = "$")) +
theme_bw()
p1 <- p +
aes(colour = region)
p1 +
geom_smooth(aes(colour = NULL), alpha = .3, show.legend = FALSE,
colour = "grey30") +
labs(colour = "World region")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
plotly::ggplotly(p1)
p +
geom_smooth(alpha = .3, show.legend = FALSE) +
aes(colour = sum_frac_total) +
ggrepel::geom_text_repel(aes(label = country_name, colour = NULL),
seed = 66324613) +
scale_colour_viridis_c(
trans = "log",
# https://stackoverflow.com/a/20901094/3149349
labels = scales::trans_format("identity",
format = function(x) scales::comma(round(x))))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
oa_per_country_per_SDG <- oa_per_affiliation_selected %>%
group_by(country, SDG_label, is_oa) %>%
summarise(sum_frac_oa = sum(frac_count)) %>%
mutate(prop_oa = sum_frac_oa/sum(sum_frac_oa),
sum_frac_total = sum(sum_frac_oa)) %>%
collect()
oa_sdg_with_gdp_per_cap <- oa_per_country_per_SDG %>%
left_join(wb_local, by = c("country" = "country_code")) %>%
select(-indicator_name) %>%
filter(indicator_code %in% c("NY.GDP.PCAP.KD")) %>%
pivot_wider(names_from = indicator_code, values_from = value) %>%
drop_na() %>%
filter(is_oa)
oa_sdg_with_gdp_per_cap %>%
filter(sum_frac_total >= 100) %>%
left_join(proper_countries, by = c("country" = "country_code")) %>%
ggplot(aes(NY.GDP.PCAP.KD, prop_oa)) +
geom_point(aes(size = sum_frac_total, colour = region)) +
geom_smooth(alpha = .3, show.legend = FALSE, colour = "grey30") +
labs(x = "GDP per capita", y = "% of publications which are OA",
colour = "World region", size = "# of publications") +
scale_size_continuous(trans = "sqrt", labels = scales::comma) +
scale_y_continuous(labels = scales::percent) +
scale_x_log10(breaks = c(1e+03, 2e+03, 5e+03, 1e+04, 2e+04, 5e+04, 1e+05),
labels = function(x) scales::comma(x, prefix = "$")) +
theme_bw() +
facet_wrap(vars(fct_relevel(SDG_label, "SDG_13", after = 3)), nrow = 2) +
theme(legend.position = c(.8, .2))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Pattern is stable across all three SDGs.
Maybe reposition the legend here: https://stackoverflow.com/questions/52060601/ggplot-multiple-legends-arrangement
make_buckets <- function(df) {
df %>%
mutate(year_bucket = case_when(year %in% 2009:2013 ~ "2009-2013",
year %in% 2014:2018 ~ "2014-2018",
TRUE ~ NA)) %>%
filter(!is.na(year_bucket))
}
oa_per_country_year <- oa_per_affiliation_selected %>%
make_buckets() %>%
group_by(country, year_bucket, is_oa) %>%
summarise(sum_frac_oa = sum(frac_count)) %>%
mutate(prop_oa = sum_frac_oa/sum(sum_frac_oa),
sum_frac_total = sum(sum_frac_oa)) %>%
collect()
wb_buckets <- wb_indicators %>%
make_buckets() %>%
group_by(country_name, country_code, indicator_code, indicator_name,
year_bucket) %>%
summarise(value = mean(value, na.rm = TRUE)) %>%
collect()
oa_with_gdp_per_cap <- oa_per_country_year %>%
left_join(wb_buckets, by = c("country" = "country_code", "year_bucket")) %>%
select(-indicator_name) %>%
filter(indicator_code %in% c("NY.GDP.PCAP.KD")) %>% # removing "GB.XPD.RSDV.GD.ZS" because of missing values
pivot_wider(names_from = indicator_code, values_from = value) %>%
drop_na() %>%
filter(is_oa)
pdata <- oa_with_gdp_per_cap %>%
group_by(country) %>%
mutate(all_in = all(sum_frac_total > 100)) %>%
filter(all_in) %>%
ungroup() %>%
select(country, year_bucket, prop_oa,
gdp = NY.GDP.PCAP.KD) %>%
pivot_wider(names_from = year_bucket, values_from = c(prop_oa, gdp)) %>%
mutate(oa_increase = if_else(`prop_oa_2009-2013` < `prop_oa_2014-2018`, TRUE,
FALSE),
oa_diff = `prop_oa_2014-2018` - `prop_oa_2009-2013`) %>%
left_join(proper_countries, by = c("country" = "country_code"))
p <- pdata %>%
ggplot(aes(oa_diff, fct_reorder(region, oa_diff), label = country)) +
geom_boxplot(outlier.alpha = 0, width = .7) +
geom_jitter(height = .11) +
scale_x_continuous(labels = function(x) scales::percent(x, accuracy = 1)) +
labs(x = "Percentage point increase in OA publication share",
y = NULL,
title = "Change in OA publication propensity across regions",
subtitle = "Time window: 2009-2013 vs 2014-2018") +
theme_bw()
p
plotly::ggplotly(p)
pdata %>%
ggplot(
aes(oa_diff,
factor(income_group,
levels = c("Low income", "Lower middle income",
"Upper middle income", "High income"))
)
) +
geom_boxplot(outlier.alpha = 0, width = .7) +
geom_jitter(height = .11) +
scale_x_continuous(labels = function(x) scales::percent(x, accuracy = 1)) +
labs(x = "Percentage point increase in OA publication share",
y = NULL,
title = "Change in OA publication propensity across income groups",
subtitle = "Time window: 2009-2013 vs 2014-2018") +
theme_bw()
pdata <- oa_with_gdp_per_cap %>%
group_by(country) %>%
mutate(all_in = all(sum_frac_total > 100)) %>%
filter(all_in) %>%
ungroup() %>%
select(country_name, year_bucket, prop_oa,
gdp = NY.GDP.PCAP.KD) %>%
arrange(country_name) %>%
pivot_wider(names_from = year_bucket, values_from = c(prop_oa, gdp)) %>%
mutate(oa_increase = if_else(`prop_oa_2009-2013` < `prop_oa_2014-2018`, TRUE,
FALSE),
oa_diff = `prop_oa_2014-2018` - `prop_oa_2009-2013`)
p <- ggplot(pdata, aes(x = `gdp_2009-2013`, y = `prop_oa_2009-2013`,
xend = `gdp_2014-2018`, yend = `prop_oa_2014-2018`)) +
geom_segment(arrow = arrow(length = unit(.15, "inches")),
lineend = "round", linejoin = "round",
size = 1) +
scale_y_continuous(labels = scales::percent) +
scale_x_log10(breaks = c(1e+03, 5e+03, 1e+04, 5e+04, 1e+05),
labels = scales::comma) +
theme_bw() +
labs(x = "GDP per capita", y = "% of publications which are OA")
p +
aes(colour = oa_increase)
Only very few countries dropping in terms of OA.
p +
aes(colour = oa_diff*100) +
scale_color_gradient2(mid = "grey75", low = "red") +
labs(colour = "Change in percentage points",
title = "Change in OA from 2009-2013 to 2014-2018") +
theme(legend.position = "top")
Where are biggest increases?